Transform users into a vector space
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
We'll build a small program to generate a sample dataset of users with the schema below:
Wallet Address | Transaction Count | Transaction Volume | Avg Transaction Value | Transaction Frequency | Asset Diversity | Liquidity Provision | Staking Activity | Yield Farming Participation | Borrowing Value | Borrowing Frequency | Lending Value | Lending Frequency | Active Periods| Transaction Recency | Historical Activity Trends
import random
import string
def random_wallet_address():
    """Return a synthetic Ethereum-style wallet address.

    A valid address is "0x" followed by 40 hexadecimal characters. The
    previous version sampled from ``ascii_letters + digits``, which
    produced strings that are not valid hex addresses, so we draw from
    the lowercase hex alphabet instead.
    """
    return "0x" + ''.join(random.choices("0123456789abcdef", k=40))
def random_time_frame():
    """Pick a recurring activity cadence label uniformly at random."""
    cadences = ('Daily', 'Weekly', 'Bi-weekly', 'Monthly', 'Never')
    return random.choice(cadences)
def random_activity_trend():
    """Draw a historical activity trend label uniformly at random."""
    return random.choice(('Increasing', 'Stable', 'Decreasing'))
def random_active_periods():
    """Pick one typical weekly activity window at random."""
    windows = (
        'Weekdays 10am-4pm', 'Weekdays 6pm-10pm', 'Weekdays 9am-5pm',
        'Weekdays 7am-9am', 'Weekends 1pm-5pm', 'Weekends 8pm-12am',
        'Weekends 6pm-10pm', 'Weekdays 11am-3pm', 'Weekends 10am-2pm',
    )
    return random.choice(windows)
# Generate 100 synthetic user records.
NUM_RECORDS = 100


def _sample_record():
    """Assemble one randomised user-activity record matching the schema above."""
    return {
        "Wallet Address": random_wallet_address(),
        "Transaction Count": random.randint(10, 200),
        "Transaction Volume": random.randint(1000, 50000),
        "Avg Transaction Value": random.randint(200, 300),
        "Transaction Frequency": random_time_frame(),
        "Asset Diversity": random.randint(1, 10),
        "Liquidity Provision": random.randint(1000, 20000),
        "Staking Activity": random.randint(500, 10000),
        "Yield Farming Participation": random.randint(0, 15000),
        "Borrowing Value": random.randint(0, 5000),
        "Borrowing Frequency": random_time_frame(),
        "Lending Value": random.randint(0, 7000),
        "Lending Frequency": random_time_frame(),
        "Active Periods": random_active_periods(),
        "Transaction Recency": random.choice(["1 day ago", "2 days ago", "3 days ago", "1 week ago", "2 weeks ago", "1 month ago"]),
        "Historical Activity Trends": random_activity_trend(),
    }


data = [_sample_record() for _ in range(NUM_RECORDS)]
df = pd.DataFrame(data=data)
df.head(10)
$$u_i = f(x_i)$$
$f$: the transform function.
$x_i$: the feature set of user $i$.
$u_i$: the vector representation of user $i$.
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
def transform_function(df):
    """Encode a user-activity frame into a purely numeric feature frame.

    - Ordinal columns (frequencies, trend, recency) are mapped onto
      explicitly ordered integer codes.
    - "Active Periods" is one-hot encoded.
    - The non-numeric "Wallet Address" identifier is dropped.

    Returns a new DataFrame; the caller's frame is left unmodified.
    """
    # Work on a copy so the caller keeps the original human-readable frame,
    # and reset the index so the concat with the one-hot frame (which always
    # carries a fresh RangeIndex) aligns row-for-row.
    df = df.copy().reset_index(drop=True)

    ordinal_features = ["Transaction Frequency", "Historical Activity Trends", "Transaction Recency"]
    # Category lists define the ordinal order: index 0 encodes to 0.0, etc.
    ordinal_encoder = OrdinalEncoder(categories=[
        ["Never", "Monthly", "Bi-weekly", "Weekly", "Daily"],
        ["Decreasing", "Stable", "Increasing"],
        ["1 day ago", "2 days ago", "3 days ago", "1 week ago", "2 weeks ago", "1 month ago"],
    ])
    df[ordinal_features] = ordinal_encoder.fit_transform(df[ordinal_features])

    onehot_features = ["Active Periods"]
    onehot_encoder = OneHotEncoder()
    onehot_encoded = onehot_encoder.fit_transform(df[onehot_features]).toarray()
    onehot_encoded_df = pd.DataFrame(
        onehot_encoded,
        columns=onehot_encoder.get_feature_names_out(onehot_features),
    )

    # Drop the identifier and the now-encoded categorical column, then append
    # the one-hot columns.
    df = pd.concat([df.drop(columns=["Wallet Address", "Active Periods"]), onehot_encoded_df], axis=1)
    return df
df = transform_function(df)

# Show every column, and column contents in full, when displaying the frame.
pd.set_option("display.max_colwidth", None)
pd.set_option("display.max_columns", None)

# The frequency ordinals duplicate information already captured by the
# corresponding value columns, so drop them before vectorisation.
df = df.drop(columns=['Lending Frequency', 'Borrowing Frequency'])
df.head()
Assume we have a set of users represented by a matrix $P$, where each user is represented by a row vector $p$. Now, we want to list the top (e.g. 10) users that are most similar to user $p_i$.
From the row-normalised matrix $P$, we compute the similarity (Gram) matrix $Q$: $$ \begin{aligned} Q = P \cdot P^T \\ \end{aligned} $$
Then, the similarity scores for $p_i$ are the row $q_i$; to get the users most similar to $p_i$: $$top_i = \operatorname{argmax}(q_i)$$
(we also exclude user $i$ itself from $top_i$, as its self-similarity score is 1)
user_matrix = df.astype(float).to_numpy()
# L2-normalise each row so the pairwise dot products become cosine similarities.
row_norms = np.linalg.norm(user_matrix, axis=1, keepdims=True)
norm_products = user_matrix / row_norms
Q = norm_products @ norm_products.T

# Visualise the full user-to-user similarity matrix.
fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(Q, annot=True, fmt='.2f', cmap='RdYlGn')
plt.show()
Implement a simple classification on this data to predict which users show the highest participation in staking...
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
# Assuming the data has already been prepared and encoded as in the previous steps.
# Label users whose staking activity is above the cohort median as "high
# participation" (1), everyone else as 0.
median_staking = df['Staking Activity'].median()
df['High Staking Participation'] = (df['Staking Activity'] > median_staking).astype(int)

# Features and target: keep the raw staking column out of the features so the
# model cannot trivially read off the label.
X = df.drop(columns=['Staking Activity', 'High Staking Participation'])
y = df['High Staking Participation']

# Hold out 20% of the rows for evaluation (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit a random-forest classifier on the training split.
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

# Predict on the held-out split and report accuracy plus per-class metrics.
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print(f'Classification Report:\n{report}')